About me

Goals for Today

What is R?

R Windows

RStudio Integrated Development Environment

Why I use R and you should too

Basic Operations

1 + 2 + 3
[1] 6
1 + 2 * 3
[1] 7
(1 + 2) * 3
[1] 9
c(0,1,2,3,5,8)
[1] 0 1 2 3 5 8
1:11
 [1]  1  2  3  4  5  6  7  8  9 10 11
1:11 + 3
 [1]  4  5  6  7  8  9 10 11 12 13 14

Assignment

x <- 3
y <- 1:10
z <- c(1,20,3,6)

Evaluation

x
[1] 3
y
 [1]  1  2  3  4  5  6  7  8  9 10
z
[1]  1 20  3  6
x + y
 [1]  4  5  6  7  8  9 10 11 12 13
x * z
[1]  3 60  9 18

Getting help within R

?mean
help(mean)

Types of R Objects

Types of R variables in Data Frames

my.character <- c('Emma','Olivia','Sophia')
my.character
[1] "Emma"   "Olivia" "Sophia"
my.dates <- c(as.Date("2012-04-05"), Sys.Date())
my.dates
[1] "2012-04-05" "2016-10-25"
my.numeric <- c(12,62.3,7,101)
my.numeric
[1]  12.0  62.3   7.0 101.0
my.factor <- factor(c(0,0,0,1,1), labels=c('Female','Male'))
my.factor
[1] Female Female Female Male   Male  
Levels: Female Male
my.logical <- c(TRUE,FALSE,FALSE)
my.logical
[1]  TRUE FALSE FALSE

Vectorized Arithmetic

weight.kg <- c(60, 72, 57, 90, 95, 72)
height.meters <- c(1.75, 1.80, 1.65, 1.90, 1.74, 1.91)
bmi <- weight.kg/height.meters^2
bmi
[1] 19.59184 22.22222 20.93664 24.93075 31.37799 19.73630

Make simple R objects

byear <- c(68,69,71,72)
byear
[1] 68 69 71 72
Boys <- data.frame(kid = c('JHC','CMC','REC','WBC'), 
                   byear = c(68,69,70,71))
Boys
  kid byear
1 JHC    68
2 CMC    69
3 REC    70
4 WBC    71

Access names and types of variables in a data frame

names(Boys)
[1] "kid"   "byear"
str(Boys)
'data.frame':   4 obs. of  2 variables:
 $ kid  : Factor w/ 4 levels "CMC","JHC","REC",..: 2 1 3 4
 $ byear: num  68 69 70 71

Accessing variables in data frames

Boys$kid
[1] JHC CMC REC WBC
Levels: CMC JHC REC WBC
with(Boys, mean(byear))
[1] 69.5

The R Workspace

ls()
 [1] "bmi"           "Boys"          "byear"         "height.meters"
 [5] "my.character"  "my.dates"      "my.factor"     "my.logical"   
 [9] "my.numeric"    "weight.kg"     "x"             "y"            
[13] "z"            
library(foreign)
ls("package:foreign")
 [1] "data.restore"  "lookup.xport"  "read.arff"     "read.dbf"     
 [5] "read.dta"      "read.epiinfo"  "read.mtp"      "read.octave"  
 [9] "read.S"        "read.spss"     "read.ssd"      "read.systat"  
[13] "read.xport"    "write.arff"    "write.dbf"     "write.dta"    
[17] "write.foreign"

Getting data into R

read.table

mytable <- read.table("c:/chuck/NYU/Stat Group/table1.txt", sep=",", header=TRUE)
mytable
  id female inc80 inc81 inc82
1  1      0  5000  5500  6000
2  2      1  2000  2200  3300
3  3      0  3000  2000  1000
str(mytable)
'data.frame':   3 obs. of  5 variables:
 $ id    : int  1 2 3
 $ female: int  0 1 0
 $ inc80 : int  5000 2000 3000
 $ inc81 : int  5500 2200 2000
 $ inc82 : int  6000 3300 1000

read.spss

library(foreign)
titanic <- read.spss("c:/chuck/NYU/Stat Group/titanic.sav", to.data.frame = TRUE)
head(titanic)
  pclass survived     age     fare gender
1  First      Yes 29.0000 211.3375 Female
2  First      Yes  0.9167 151.5500   Male
3  First       No  2.0000 151.5500 Female
4  First       No 30.0000 151.5500   Male
5  First       No 25.0000 151.5500 Female
6  First      Yes 48.0000  26.5500   Male
attributes(titanic)$variable.labels
            pclass           survived                age 
 "Passenger Class"         "Survived"    "Passenger Age" 
              fare             gender 
  "Passenger Fare" "Passenger Gender" 

Read and combine many files

library(openxlsx)
my.files <- list.files("c:/chuck/NYU/Stat Group/R Workshop/Many Files/", pattern = ".xlsx$", full.names=TRUE, recursive=TRUE)
my.files # vector of the file names with full path
[1] "c:/chuck/NYU/Stat Group/R Workshop/Many Files/File1.xlsx"
[2] "c:/chuck/NYU/Stat Group/R Workshop/Many Files/File2.xlsx"
[3] "c:/chuck/NYU/Stat Group/R Workshop/Many Files/File3.xlsx"
[4] "c:/chuck/NYU/Stat Group/R Workshop/Many Files/File4.xlsx"
[5] "c:/chuck/NYU/Stat Group/R Workshop/Many Files/File5.xlsx"
[6] "c:/chuck/NYU/Stat Group/R Workshop/Many Files/File6.xlsx"
[7] "c:/chuck/NYU/Stat Group/R Workshop/Many Files/File7.xlsx"
[8] "c:/chuck/NYU/Stat Group/R Workshop/Many Files/File8.xlsx"
my.sheets <- lapply(my.files, function(x){getSheetNames(x)}) # list of worksheet names within each MS-Excel workbook
unlist(my.sheets)
[1] "Sheet 1" "Sheet 1" "Sheet 1" "Sheet 1" "Sheet 1" "Sheet 1" "Sheet 1"
[8] "Sheet 1"
my.sheets.DF <- data.frame(file = rep(my.files, unlist(lapply(my.sheets, length))), sheet = unlist(my.sheets))
my.sheets.DF
                                                      file   sheet
1 c:/chuck/NYU/Stat Group/R Workshop/Many Files/File1.xlsx Sheet 1
2 c:/chuck/NYU/Stat Group/R Workshop/Many Files/File2.xlsx Sheet 1
3 c:/chuck/NYU/Stat Group/R Workshop/Many Files/File3.xlsx Sheet 1
4 c:/chuck/NYU/Stat Group/R Workshop/Many Files/File4.xlsx Sheet 1
5 c:/chuck/NYU/Stat Group/R Workshop/Many Files/File5.xlsx Sheet 1
6 c:/chuck/NYU/Stat Group/R Workshop/Many Files/File6.xlsx Sheet 1
7 c:/chuck/NYU/Stat Group/R Workshop/Many Files/File7.xlsx Sheet 1
8 c:/chuck/NYU/Stat Group/R Workshop/Many Files/File8.xlsx Sheet 1
my.list <- vector(mode="list", length = nrow(my.sheets.DF)) # empty list to hold each data file
# Read every (file, sheet) pair listed in my.sheets.DF into the preallocated
# list my.list. seq_len() replaces 1:nrow(): if my.sheets.DF ever had zero
# rows, 1:0 would wrongly iterate over c(1, 0), while seq_len(0) iterates
# zero times.
for (i in seq_len(nrow(my.sheets.DF))) {
  my.list[[i]] <- read.xlsx(xlsxFile = as.character(my.sheets.DF$file)[i],
                            sheet = as.character(my.sheets.DF$sheet)[i])
}
my.DF <- as.data.frame(do.call(rbind, my.list))
dim(my.DF) # 80 observations, 2 variables 
[1] 80  2

Getting data out of R

Data management: Recoding

library(foreign)
library(car)
auto <- read.spss("c:/chuck/NYU/Stat Group/R Workshop/auto.sav", to.data.frame=TRUE)
summary(auto$MPG)
auto$MPG3 <- car::recode(auto$MPG, "lo:18=1; 19:23=2; 24:hi=3")
table(auto$MPG3)

 1  2  3 
27 24 23 

Data management: Merging

Two data frames with a common ID variable

A <- data.frame(famid = c(2,1,3), 
                name = c('Art','Bill','Paul'), 
                inc = c(22,30,25))
B <- data.frame(famid = c(3,1,2), 
                faminc96 = c(75,40,45), 
                faminc97 = c(76,40.5,40.4), 
                faminc98 = c(77,41,45.8))
intersect(names(A), names(B))
[1] "famid"

One Merged Data Frame

A
  famid name inc
1     2  Art  22
2     1 Bill  30
3     3 Paul  25
B
  famid faminc96 faminc97 faminc98
1     3       75     76.0     77.0
2     1       40     40.5     41.0
3     2       45     40.4     45.8
AB <- merge(A, B)
AB
  famid name inc faminc96 faminc97 faminc98
1     1 Bill  30       40     40.5     41.0
2     2  Art  22       45     40.4     45.8
3     3 Paul  25       75     76.0     77.0

Data management: Subsetting

bank <- read.spss("c:/chuck/NYU/Stat Group/bank.sav", to.data.frame = TRUE)
head(bank)
   ID SALBEG   SEX TIME   AGE SALNOW EDLEVEL  WORK          JOBCAT
1 628   8400 Males   81 28.50  16080      16  0.25 College trainee
2 630  24000 Males   73 40.33  41400      16 12.50 Exempt employee
3 632  10200 Males   83 31.08  21960      15  4.08 Exempt employee
4 633   8700 Males   93 31.17  19200      16  1.83 College trainee
5 635  17400 Males   83 41.92  28350      19 13.00 Exempt employee
6 637  12996 Males   80 29.50  27250      18  2.42 College trainee
  MINORITY     SEXRACE EGENDER EMINORIT EGENXMIN BSALCENT   CSALADJ
1    White White males      -1       -1        1  1593.57 13133.489
2    White White males      -1       -1        1 17193.57  9609.089
3    White White males      -1       -1        1  3393.57 15685.289
4    White White males      -1       -1        1  1893.57 15698.789
5    White White males      -1       -1        1 10593.57  8762.489
6    White White males      -1       -1        1  6189.57 15805.485
  SALDELTA     RES_1 GENDER
1     7680 -1013.504      M
2    17400 -4543.634      M
3    11760  1537.635      M
4    10500  1551.686      M
5    10950 -5387.809      M
6    14254  1656.804      M
subset(bank, SALBEG < 4000 & SEX == "Males", select=c(ID,SEX,SALBEG,AGE,SALNOW))
      ID   SEX SALBEG   AGE SALNOW
414  926 Males   3600 53.50  12300
429 1077 Males   3900 29.17   9000
bank[bank$SALBEG < 4000 & bank$SEX == "Males", c(1,3,2,5,6)]
      ID   SEX SALBEG   AGE SALNOW
414  926 Males   3600 53.50  12300
429 1077 Males   3900 29.17   9000

Data management: Reshaping, Wide to Long

mytable <- read.table("c:/chuck/NYU/Stat Group/table1.txt", sep=",", header=TRUE)
mytable
  id female inc80 inc81 inc82
1  1      0  5000  5500  6000
2  2      1  2000  2200  3300
3  3      0  3000  2000  1000
library(dplyr)
library(tidyr)
longtab <- gather(mytable, year, inc, inc80:inc82)
longtab
  id female  year  inc
1  1      0 inc80 5000
2  2      1 inc80 2000
3  3      0 inc80 3000
4  1      0 inc81 5500
5  2      1 inc81 2200
6  3      0 inc81 2000
7  1      0 inc82 6000
8  2      1 inc82 3300
9  3      0 inc82 1000

Data management: Reshaping, Long to Wide

longtab
  id female  year  inc
1  1      0 inc80 5000
2  2      1 inc80 2000
3  3      0 inc80 3000
4  1      0 inc81 5500
5  2      1 inc81 2200
6  3      0 inc81 2000
7  1      0 inc82 6000
8  2      1 inc82 3300
9  3      0 inc82 1000
widetab <- spread(longtab, year, inc)
widetab
  id female inc80 inc81 inc82
1  1      0  5000  5500  6000
2  2      1  2000  2200  3300
3  3      0  3000  2000  1000

Reshaping with data.table

Data management: Aggregating

head(iris)
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
6          5.4         3.9          1.7         0.4  setosa
aggregate(iris[,1:4], list(Species = iris$Species), FUN = mean)
     Species Sepal.Length Sepal.Width Petal.Length Petal.Width
1     setosa        5.006       3.428        1.462       0.246
2 versicolor        5.936       2.770        4.260       1.326
3  virginica        6.588       2.974        5.552       2.026

Data Management: Set Operations

x <- c('A','B','C','D')
y <- c('D','A','F','J')
setdiff(x, y) # which elements of x are not in y
[1] "B" "C"
setdiff(y, x) # which elements of y are not in x
[1] "F" "J"
intersect(x, y) # what elements are in both x and y
[1] "A" "D"
union(x, y) # elements in either x or y
[1] "A" "B" "C" "D" "F" "J"
"C" %in% x # is there a "C" in x
[1] TRUE
y %in% c("F","J") # which elements of y are equal to "F" or "J"
[1] FALSE FALSE  TRUE  TRUE

Data Management: Character Manipulation

x <- c('A','B','C','D')
grep('C', x)
[1] 3
gsub('C', 'c', x)
[1] "A" "B" "c" "D"
paste('Item', 1:10, sep="-")
 [1] "Item-1"  "Item-2"  "Item-3"  "Item-4"  "Item-5"  "Item-6"  "Item-7" 
 [8] "Item-8"  "Item-9"  "Item-10"
x <- c('fruit_apple','instrument_guitar','instrument_piano','fruit_orange','instrument_trumpet')
grepl("fruit", x)
[1]  TRUE FALSE FALSE  TRUE FALSE

Data Management: dplyr package

Houston flights data

library(hflights)
head(hflights)
     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
5424 2011     1          1         6    1400    1500            AA
5425 2011     1          2         7    1401    1501            AA
5426 2011     1          3         1    1352    1502            AA
5427 2011     1          4         2    1403    1513            AA
5428 2011     1          5         3    1405    1507            AA
5429 2011     1          6         4    1359    1503            AA
     FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
5424       428  N576AA                60      40      -10        0    IAH
5425       428  N557AA                60      45       -9        1    IAH
5426       428  N541AA                70      48       -8       -8    IAH
5427       428  N403AA                70      39        3        3    IAH
5428       428  N492AA                62      44       -3        5    IAH
5429       428  N262AA                64      45       -7       -1    IAH
     Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
5424  DFW      224      7      13         0                         0
5425  DFW      224      6       9         0                         0
5426  DFW      224      5      17         0                         0
5427  DFW      224      9      22         0                         0
5428  DFW      224      9       9         0                         0
5429  DFW      224      6      13         0                         0
str(hflights)
'data.frame':   227496 obs. of  21 variables:
 $ Year             : int  2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
 $ Month            : int  1 1 1 1 1 1 1 1 1 1 ...
 $ DayofMonth       : int  1 2 3 4 5 6 7 8 9 10 ...
 $ DayOfWeek        : int  6 7 1 2 3 4 5 6 7 1 ...
 $ DepTime          : int  1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
 $ ArrTime          : int  1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
 $ UniqueCarrier    : chr  "AA" "AA" "AA" "AA" ...
 $ FlightNum        : int  428 428 428 428 428 428 428 428 428 428 ...
 $ TailNum          : chr  "N576AA" "N557AA" "N541AA" "N403AA" ...
 $ ActualElapsedTime: int  60 60 70 70 62 64 70 59 71 70 ...
 $ AirTime          : int  40 45 48 39 44 45 43 40 41 45 ...
 $ ArrDelay         : int  -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
 $ DepDelay         : int  0 1 -8 3 5 -1 -1 -5 43 43 ...
 $ Origin           : chr  "IAH" "IAH" "IAH" "IAH" ...
 $ Dest             : chr  "DFW" "DFW" "DFW" "DFW" ...
 $ Distance         : int  224 224 224 224 224 224 224 224 224 224 ...
 $ TaxiIn           : int  7 6 5 9 9 6 12 7 8 6 ...
 $ TaxiOut          : int  13 9 17 22 9 13 15 12 22 19 ...
 $ Cancelled        : int  0 0 0 0 0 0 0 0 0 0 ...
 $ CancellationCode : chr  "" "" "" "" ...
 $ Diverted         : int  0 0 0 0 0 0 0 0 0 0 ...

Using filter() to select observations

head(filter(hflights, Month == 1, UniqueCarrier == "AA"))
  Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
1 2011     1          1         6    1400    1500            AA       428
2 2011     1          2         7    1401    1501            AA       428
3 2011     1          3         1    1352    1502            AA       428
4 2011     1          4         2    1403    1513            AA       428
5 2011     1          5         3    1405    1507            AA       428
6 2011     1          6         4    1359    1503            AA       428
  TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
1  N576AA                60      40      -10        0    IAH  DFW      224
2  N557AA                60      45       -9        1    IAH  DFW      224
3  N541AA                70      48       -8       -8    IAH  DFW      224
4  N403AA                70      39        3        3    IAH  DFW      224
5  N492AA                62      44       -3        5    IAH  DFW      224
6  N262AA                64      45       -7       -1    IAH  DFW      224
  TaxiIn TaxiOut Cancelled CancellationCode Diverted
1      7      13         0                         0
2      6       9         0                         0
3      5      17         0                         0
4      9      22         0                         0
5      9       9         0                         0
6      6      13         0                         0

Using select() to select variables

head(select(hflights, DepTime, ArrTime, AirTime))
     DepTime ArrTime AirTime
5424    1400    1500      40
5425    1401    1501      45
5426    1352    1502      48
5427    1403    1513      39
5428    1405    1507      44
5429    1359    1503      45
head(select(hflights, ends_with("Time")))
     DepTime ArrTime ActualElapsedTime AirTime
5424    1400    1500                60      40
5425    1401    1501                60      45
5426    1352    1502                70      48
5427    1403    1513                70      39
5428    1405    1507                62      44
5429    1359    1503                64      45

Using mutate() to create new variables

g1 <- mutate(hflights, ActualGroundTime = ActualElapsedTime - AirTime)
head(select(g1, ActualElapsedTime, AirTime, ActualGroundTime))
  ActualElapsedTime AirTime ActualGroundTime
1                60      40               20
2                60      45               15
3                70      48               22
4                70      39               31
5                62      44               18
6                64      45               19

Using summarize()

summarize(hflights, min_dist = min(Distance),
                    mean_dist = mean(Distance),
                    median_dist = median(Distance),
                    sd_dist = sd(Distance),
                    max_dist = max(Distance))
  min_dist mean_dist median_dist  sd_dist max_dist
1       79  787.7832         809 453.6806     3904

Using group_by() and summarize() to summarize by group

hflights %>% group_by(UniqueCarrier) %>%
summarize(min_dist = min(Distance),
          mean_dist = mean(Distance),
          median_dist = median(Distance),
          sd_dist = sd(Distance),
          max_dist = max(Distance))
# A tibble: 15 × 6
   UniqueCarrier min_dist mean_dist median_dist    sd_dist max_dist
           <chr>    <int>     <dbl>       <dbl>      <dbl>    <int>
1             AA      224  483.8212         224 353.269167      964
2             AS     1874 1874.0000        1874   0.000000     1874
3             B6     1428 1428.0000        1428   0.000000     1428
4             CO      140 1098.0549        1190 505.204266     3904
5             DL      689  723.2832         689 103.886047     1076
6             EV      468  775.6815         696 259.664313     1076
7             F9      666  882.7411         883   7.496141      883
8             FL      490  685.4063         696  45.508796      696
9             MQ      247  650.5310         247 447.617384     1379
10            OO      140  819.7279         809 299.852214     1428
11            UA      643 1177.8388         925 326.355580     1635
12            US      912  981.4677         913 110.098250     1325
13            WN      148  606.6218         453 399.144719     1642
14            XE       79  589.0326         562 280.514799     1201
15            YV      912  938.6709         913  79.603621     1190

Notice group_by() and summarize() essentially aggregate

iris %>% 
  group_by(Species) %>%
  summarize(Sepal.Length = mean(Sepal.Length),
            Sepal.Width = mean(Sepal.Width),
            Petal.Length = mean(Petal.Length),
            Petal.Width = mean(Petal.Width))
# A tibble: 3 × 5
     Species Sepal.Length Sepal.Width Petal.Length Petal.Width
      <fctr>        <dbl>       <dbl>        <dbl>       <dbl>
1     setosa        5.006       3.428        1.462       0.246
2 versicolor        5.936       2.770        4.260       1.326
3  virginica        6.588       2.974        5.552       2.026

Putting data into a desired order with arrange()

hflights %>% group_by(UniqueCarrier) %>%
summarize(min_dist = min(Distance),
          mean_dist = mean(Distance),
          median_dist = median(Distance),
          sd_dist = sd(Distance),
          max_dist = max(Distance)) %>%
arrange(mean_dist)
# A tibble: 15 × 6
   UniqueCarrier min_dist mean_dist median_dist    sd_dist max_dist
           <chr>    <int>     <dbl>       <dbl>      <dbl>    <int>
1             AA      224  483.8212         224 353.269167      964
2             XE       79  589.0326         562 280.514799     1201
3             WN      148  606.6218         453 399.144719     1642
4             MQ      247  650.5310         247 447.617384     1379
5             FL      490  685.4063         696  45.508796      696
6             DL      689  723.2832         689 103.886047     1076
7             EV      468  775.6815         696 259.664313     1076
8             OO      140  819.7279         809 299.852214     1428
9             F9      666  882.7411         883   7.496141      883
10            YV      912  938.6709         913  79.603621     1190
11            US      912  981.4677         913 110.098250     1325
12            CO      140 1098.0549        1190 505.204266     3904
13            UA      643 1177.8388         925 326.355580     1635
14            B6     1428 1428.0000        1428   0.000000     1428
15            AS     1874 1874.0000        1874   0.000000     1874

Benefits of Piping

Without Piping

arrange(
  summarize(
    group_by(
      filter(mtcars, carb > 1),
    cyl),
  Avg_mpg = mean(mpg)
  ),
desc(Avg_mpg)  
  )
# A tibble: 3 × 2
    cyl Avg_mpg
  <dbl>   <dbl>
1     4   25.90
2     6   19.74
3     8   15.10

With Piping

mtcars %>%
  filter(carb > 1) %>%
  group_by(cyl) %>%
  summarize(Avg_mpg = mean(mpg)) %>%
  arrange(desc(Avg_mpg))
# A tibble: 3 × 2
    cyl Avg_mpg
  <dbl>   <dbl>
1     4   25.90
2     6   19.74
3     8   15.10

Plots: Histogram

hsb2 <- read.table('http://www.ats.ucla.edu/stat/r/modules/hsb2.csv', header=T, sep=",")
library(ggplot2)
ggplot(hsb2, aes(x = write)) + geom_histogram()

Plots: Bar

library(openxlsx)
fname <- read.xlsx("c:/chuck/NYU/Stat Group/R Workshop/Female-Names.xlsx", sheet = 1)
fname
   Rank      Name Ngirls
1     1      Emma  20799
2     2    Olivia  19674
3     3    Sophia  18490
4     4  Isabella  16950
5     5       Ava  15586
6     6       Mia  13442
7     7     Emily  12562
8     8   Abigail  11985
9     9   Madison  10247
10   10 Charlotte  10048
ggplot(fname, aes(x = Name, y = Ngirls)) + 
  geom_bar(stat = 'identity') + 
  ylab('Number') + xlab('') + 
  ggtitle('Ten Most Common Female Baby\nNames in 2014') +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))

Plots: Scatter

ggplot2 package

ggplot(hsb2, aes(x = write, y = read)) + geom_point()

Base R

plot(read ~ write, data = hsb2)

Plots: Scatter Enhanced

library(ggplot2)
ggplot(subset(bank, SALNOW < 30000), aes(x = SALBEG, y = SALNOW, colour = GENDER)) + 
  geom_point() + scale_y_continuous(limits=c(0,30000)) + scale_x_continuous(limits=c(0,30000)) 

Plots: Scatter with Facets

library(ggplot2)
ggplot(subset(bank, SALNOW < 30000), aes(x = SALBEG, y = SALNOW)) + 
  geom_point() + scale_y_continuous(limits=c(0,30000)) + scale_x_continuous(limits=c(0,30000)) + facet_wrap( ~ GENDER)

Plots: Boxplots

hsb <- read.dta('c:/chuck/NYU/Stat Group/R Workshop/hsb.dta')
ggplot(hsb, aes(x = female, y = write)) + geom_boxplot() + xlab("")

ggplot(hsb, aes(x = female, y = write)) + geom_boxplot() + xlab("") + coord_flip()

Plots: Alpha Transparency

ggplot(myDF, aes(x=X,y=Y)) + geom_point(color="blue", size=3)

ggplot(myDF, aes(x=X,y=Y)) + geom_point(color="blue", size=3, alpha =.20)

Plots: Networks

library(igraph)
net.bg <- barabasi.game(80) 
V(net.bg)$frame.color <- sample(c("white","black"), size = 80, replace=TRUE, prob=c(0.5,0.5))
V(net.bg)$color <- sample(c("orange","lightblue"), size = 80, replace=TRUE, prob=c(0.5,0.5))
V(net.bg)$label <- "" 
V(net.bg)$size <- 10
E(net.bg)$arrow.mode <- 0
plot(net.bg, layout=layout.lgl)

Plots: Maps

Plots: More customized

library(ggplot2) # devtools::install_github("hadley/ggplot2")
library(ggalt)   # devtools::install_github("hrbrmstr/ggalt")
library(dplyr)   # for data_frame() & arrange()

# Pew Research survey data: share of adult internet users on social
# networking sites, by country, for two age bands, plus a pre-formatted
# percentage-point gap ("diff") used as a text label later.
# NOTE(review): data_frame() is deprecated in current dplyr/tibble;
# tibble() is the drop-in modern equivalent — confirm before updating.
df <- data_frame(country=c("Germany", "France", "Vietnam", "Japan", "Poland", "Lebanon",
                           "Australia", "South\nKorea", "Canada", "Spain", "Italy", "Peru",
                           "U.S.", "UK", "Mexico", "Chile", "China", "India"),
                 ages_35=c(0.39, 0.42, 0.49, 0.43, 0.51, 0.57,
                           0.60, 0.45, 0.65, 0.57, 0.57, 0.65,
                           0.63, 0.59, 0.67, 0.75, 0.52, 0.48),
                 ages_18_to_34=c(0.81, 0.83, 0.86, 0.78, 0.86, 0.90,
                                 0.91, 0.75, 0.93, 0.85, 0.83, 0.91,
                                 0.89, 0.84, 0.90, 0.96, 0.73, 0.69),
                 diff=sprintf("+%d", as.integer((ages_18_to_34-ages_35)*100)))

# Order countries by the age gap, largest first.
# NOTE(review): this sorts the *character* column "diff" lexicographically;
# it happens to give the numeric order here only because every diff value
# has exactly two digits — verify if the data ever changes.
df <- arrange(df, desc(diff))
# Reverse the factor levels so the largest gap plots at the top of the y-axis.
df$country <- factor(df$country, levels=rev(df$country))

# Format a vector of proportions as percent labels, keeping the "%" sign
# only on the first element — a common axis-labeling trick so a column of
# labels reads "39%", "42", "49", ...
#
# x: numeric vector of proportions (e.g. 0.39 for 39%).
# Returns: character vector the same length as x.
percent_first <- function(x) {
  x <- sprintf("%d%%", round(x * 100))
  # Strip the "%" from every element after the first. The original used
  # x[2:length(x)], which breaks for length-1 input: 2:1 counts DOWN,
  # stripping the first element's "%" and appending an NA. Guarding on
  # length and using negative indexing handles all lengths correctly.
  if (length(x) > 1) {
    x[-1] <- sub("%$", "", x[-1])
  }
  x
}

# Dumbbell-style chart built layer by layer: for each country, a segment
# connects the ages-35+ share to the ages-18-34 share, with labelled
# endpoints and a shaded "DIFF" column on the right. Layer order matters:
# later geoms draw on top of earlier ones.
gg <- ggplot()
# Thin full-width guide line per country, then the thicker grey segment
# spanning the two age-group values (the "dumbbell" bar).
gg <- gg + geom_segment(data=df, aes(y=country, yend=country, x=0, xend=1), color="#b2b2b2", size=0.15)
gg <- gg + geom_segment(data=df, aes(y=country, yend=country, x=ages_35, xend=ages_18_to_34), color="#b2b2b2", size=2)
# Endpoint dots: green (#9fb059) = ages 35+, orange (#edae52) = ages 18-34.
gg <- gg + geom_point(data=df, aes(x=ages_35, y = country), size=3, colour = "#9fb059")
gg <- gg + geom_point(data=df, aes(x=ages_18_to_34, y = country), size=3, colour = "#edae52")
# Legend-as-annotation: label each series once, above the Germany row only.
# NOTE(review): assumes the "Calibri" font is registered on the system.
gg <- gg + geom_text(data=dplyr::filter(df, country=="Germany"),
                     aes(x=ages_35, y=country, label="Ages 35+"),
                     color="#9fb059", size=3, vjust=-1, fontface="bold", family="Calibri")
gg <- gg + geom_text(data=dplyr::filter(df, country=="Germany"),
                     aes(x=ages_18_to_34, y=country, label="Ages 18-34"),
                     color="#edae52", size=3, vjust=-1, fontface="bold", family="Calibri")
# Percent value under each dot; percent_first() keeps the "%" sign only on
# the first (top) label of each series.
gg <- gg + geom_text(data=df, aes(x=ages_35, y=country, label=percent_first(ages_35)),
                     color="#9fb059", size=2.75, vjust=1.75, family="Calibri")
gg <- gg + geom_text(data=df, color="#edae52", size=2.75, vjust=1.75, family="Calibri",
                     aes(x=ages_18_to_34, y=country, label=percent_first(ages_18_to_34)))
# Shaded strip past x=1 holding the percentage-point difference column,
# with the diff values and a "DIFF" header above the Germany row.
gg <- gg + geom_rect(data=df, aes(xmin=1.05, xmax=1.175, ymin=-Inf, ymax=Inf), fill="#efefe3")
gg <- gg + geom_text(data=df, aes(label=diff, y=country, x=1.1125), fontface="bold", size=3, family="Calibri")
gg <- gg + geom_text(data=filter(df, country=="Germany"), aes(x=1.1125, y=country, label="DIFF"),
                     color="#7a7d7e", size=3.1, vjust=-2, fontface="bold", family="Calibri")
# Extend the x-axis to 1.175 to leave room for the DIFF strip; no padding.
gg <- gg + scale_x_continuous(expand=c(0,0), limits=c(0, 1.175))
gg <- gg + scale_y_discrete(expand=c(0.075,0))
gg <- gg + labs(x=NULL, y=NULL, title="The social media age gap",
                subtitle="Adult internet users or reported smartphone owners who\nuse social networking sites",
                caption="Source: Pew Research Center, Spring 2015 Global Attitudes Survey. Q74")
# Strip chart junk: no grid lines, panel border, axis ticks, or x labels.
gg <- gg + theme_bw(base_family="Calibri")
gg <- gg + theme(panel.grid.major=element_blank())
gg <- gg + theme(panel.grid.minor=element_blank())
gg <- gg + theme(panel.border=element_blank())
gg <- gg + theme(axis.ticks=element_blank())
gg <- gg + theme(axis.text.x=element_blank())
gg <- gg + theme(plot.title=element_text(face="bold"))
gg <- gg + theme(plot.subtitle=element_text(face="italic", size=9, margin=margin(b=12)))
gg <- gg + theme(plot.caption=element_text(size=7, margin=margin(t=12), color="#7a7d7e"))
gg  # print the finished plot

Analysis: descriptive statistics

hsb <- read.dta('c:/chuck/NYU/Stat Group/R Workshop/hsb.dta')
summary(hsb)
       id            female              race         ses    
 Min.   :  1.00   male  : 91   hispanic    : 24   low   :47  
 1st Qu.: 50.75   female:109   asian       : 11   middle:95  
 Median :100.50                african-amer: 20   high  :58  
 Mean   :100.50                white       :145              
 3rd Qu.:150.25                                              
 Max.   :200.00                                              
     schtyp          prog          read           write      
 public :168   general : 45   Min.   :28.00   Min.   :31.00  
 private: 32   academic:105   1st Qu.:44.00   1st Qu.:45.75  
               vocation: 50   Median :50.00   Median :54.00  
                              Mean   :52.23   Mean   :52.77  
                              3rd Qu.:60.00   3rd Qu.:60.00  
                              Max.   :76.00   Max.   :67.00  
      math          science          socst      
 Min.   :33.00   Min.   :26.00   Min.   :26.00  
 1st Qu.:45.00   1st Qu.:44.00   1st Qu.:46.00  
 Median :52.00   Median :53.00   Median :52.00  
 Mean   :52.65   Mean   :51.85   Mean   :52.41  
 3rd Qu.:59.00   3rd Qu.:58.00   3rd Qu.:61.00  
 Max.   :75.00   Max.   :74.00   Max.   :71.00  

Analysis: frequencies

library(descr)
with(bank, freq(JOBCAT, plot=FALSE))
JOBCAT 
                 Frequency Percent
Clerical               227  47.890
Office trainee         136  28.692
Security officer        27   5.696
College trainee         41   8.650
Exempt employee         32   6.751
MBA trainee              5   1.055
Technical                6   1.266
Total                  474 100.000

Analysis: crosstabulation

library(descr)
with(bank, CrossTable(SEX, JOBCAT %in% c("Clerical","Office trainee"), 
                    prop.r=TRUE, prop.chi=FALSE, prop.t=FALSE, prop.c=FALSE, chisq=TRUE))
.    Cell Contents 
. |-------------------------|
. |                   Count | 
. |             Row Percent | 
. |-------------------------|
. 
. ================================
.            JOBCAT %in% c("Clerical", "Office trainee")
. SEX        FALSE    TRUE   Total
. --------------------------------
. Males       101     157     258 
.            39.1%   60.9%   54.4%
. --------------------------------
. Females      10     206     216 
.             4.6%   95.4%   45.6%
. --------------------------------
. Total       111     363     474 
. ================================
. 
. Statistics for All Table Factors
. 
. Pearson's Chi-squared test 
. ------------------------------------------------------------
. Chi^2 = 78.10967      d.f. = 1      p <2e-16 
. 
. Pearson's Chi-squared test with Yates' continuity correction 
. ------------------------------------------------------------
. Chi^2 = 76.19681      d.f. = 1      p <2e-16 
.         Minimum expected frequency: 50.58228

More detailed descriptive statistics for quantitative variables

library(psych)
psych::describe(subset(hsb, select=c(read,write,math,science,socst)))
        vars   n  mean    sd median trimmed   mad min max range  skew
read       1 200 52.23 10.25     50   52.03 10.38  28  76    48  0.19
write      2 200 52.77  9.48     54   53.36 11.86  31  67    36 -0.47
math       3 200 52.65  9.37     52   52.23 10.38  33  75    42  0.28
science    4 200 51.85  9.90     53   52.02 11.86  26  74    48 -0.19
socst      5 200 52.41 10.74     52   52.99 13.34  26  71    45 -0.38
        kurtosis   se
read       -0.66 0.72
write      -0.78 0.67
math       -0.69 0.66
science    -0.60 0.70
socst      -0.57 0.76

Analysis: t-tests

hsb <- read.dta('c:/chuck/NYU/Stat Group/R Workshop/hsb.dta')
with(hsb, t.test(write, mu=50))

    One Sample t-test

data:  write
t = 4.1403, df = 199, p-value = 5.121e-05
alternative hypothesis: true mean is not equal to 50
95 percent confidence interval:
 51.45332 54.09668
sample estimates:
mean of x 
   52.775 
with(hsb, t.test(write, read, paired = TRUE))

    Paired t-test

data:  write and read
t = 0.86731, df = 199, p-value = 0.3868
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.6941424  1.7841424
sample estimates:
mean of the differences 
                  0.545 
t.test(write ~ female, data = hsb, var.equal = TRUE)

    Two Sample t-test

data:  write by female
t = -3.7341, df = 198, p-value = 0.0002463
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -7.441835 -2.298059
sample estimates:
  mean in group male mean in group female 
            50.12088             54.99083 
t.test(write ~ female, data = hsb)

    Welch Two Sample t-test

data:  write by female
t = -3.6564, df = 169.71, p-value = 0.0003409
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -7.499159 -2.240734
sample estimates:
  mean in group male mean in group female 
            50.12088             54.99083 

Analysis: Correlation

hsb <- read.spss("c:/chuck/NYU/Stat Group/R Workshop/hsb2.sav", to.data.frame=TRUE)
round(cor(subset(hsb, select=c(READ,WRITE,MATH,SCIENCE,SOCST))), 2)
        READ WRITE MATH SCIENCE SOCST
READ    1.00  0.60 0.66    0.63  0.62
WRITE   0.60  1.00 0.62    0.57  0.60
MATH    0.66  0.62 1.00    0.63  0.54
SCIENCE 0.63  0.57 0.63    1.00  0.47
SOCST   0.62  0.60 0.54    0.47  1.00
# Test whether the READ-SOCST correlation differs from zero (formula interface)
cor.test(~ READ + SOCST, data = hsb)

    Pearson's product-moment correlation

data:  READ and SOCST
t = 11.163, df = 198, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.5282957 0.6998781
sample estimates:
      cor 
0.6214843 

Analysis: Linear Regression

# Re-read the SPSS data and preview the first six rows
hsb <- read.spss("c:/chuck/NYU/Stat Group/R Workshop/hsb2.sav", to.data.frame=TRUE)
head(hsb)
   ID FEMALE  RACE    SES SCHTYP     PROG READ WRITE MATH SCIENCE SOCST
1  70   male white    low public  general   57    52   41      47    57
2 121 female white middle public vocation   68    59   53      63    61
3  86   male white   high public  general   44    33   54      58    31
4 141   male white   high public vocation   63    44   47      53    56
5 172   male white middle public academic   47    52   57      53    61
6 113   male white middle public academic   44    52   51      63    61
# Linear regression of science score on math, gender, social studies, and reading
lm.1 <- lm(SCIENCE ~ MATH + FEMALE + SOCST + READ, data = hsb)
summary(lm.1)

Call:
lm(formula = SCIENCE ~ MATH + FEMALE + SOCST + READ, data = hsb)

Residuals:
     Min       1Q   Median       3Q      Max 
-18.6706  -4.5764  -0.3237   4.5006  21.8563 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  12.32529    3.19356   3.859 0.000154 ***
MATH          0.38931    0.07412   5.252 3.92e-07 ***
FEMALEfemale -2.00976    1.02272  -1.965 0.050820 .  
SOCST         0.04984    0.06223   0.801 0.424139    
READ          0.33530    0.07278   4.607 7.36e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 7.148 on 195 degrees of freedom
Multiple R-squared:  0.4892,    Adjusted R-squared:  0.4788 
F-statistic: 46.69 on 4 and 195 DF,  p-value: < 2.2e-16
# 95% confidence intervals for the regression coefficients
confint(lm.1)
                   2.5 %      97.5 %
(Intercept)   6.02694246 18.62363584
MATH          0.24312201  0.53549835
FEMALEfemale -4.02677214  0.00724283
SOCST        -0.07288986  0.17257843
READ          0.19176512  0.47883447

Analysis: Logistic Regression

Variable Description Codes/Values
LOW Low Birth Weight 1 = BWT<=2500g, 0 = BWT>2500g
AGE Age of Mother Years
LWT Weight of Mother at Last Menstrual Period Pounds
RACE Race 1=White, 2=Black, 3=Other
FTV First Trimester Physician Visits 0=None, 1=One, 2=Two, etc.
# Read the low-birth-weight data (SPSS format)
lowbwt <- read.spss('c:/chuck/NYU/Stat Group/R Workshop/lowbwt.sav', to.data.frame=TRUE)
# Relabel the numeric RACE codes 1/2/3 as White/Black/Other
# NOTE(review): as.factor.result was renamed to as.factor in later car releases - confirm the installed version
lowbwt$RACE <- car::recode(lowbwt$RACE, "1='White';2='Black';3='Other'", as.factor.result = TRUE)
# Make White the reference level so the model contrasts Black and Other against it
lowbwt$RACE <- relevel(lowbwt$RACE, ref = 'White')
# Logistic regression of low birth weight on mother's age, weight, race, and physician visits
lrm.1 <- glm(LOW ~ AGE + LWT + RACE + FTV, family='binomial', data = lowbwt)
summary(lrm.1)

Call:
glm(formula = LOW ~ AGE + LWT + RACE + FTV, family = "binomial", 
    data = lowbwt)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4163  -0.8931  -0.7113   1.2454   2.0755  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)  
(Intercept)  1.295366   1.071443   1.209   0.2267  
AGE         -0.023823   0.033730  -0.706   0.4800  
LWT         -0.014245   0.006541  -2.178   0.0294 *
RACEBlack    1.003898   0.497859   2.016   0.0438 *
RACEOther    0.433108   0.362240   1.196   0.2318  
FTV         -0.049308   0.167239  -0.295   0.7681  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 234.67  on 188  degrees of freedom
Residual deviance: 222.57  on 183  degrees of freedom
AIC: 234.57

Number of Fisher Scoring iterations: 4
# Odds ratios: exponentiate the log-odds coefficients
exp(coef(lrm.1)[-1]) # Removing constant
      AGE       LWT RACEBlack RACEOther       FTV 
0.9764586 0.9858564 2.7288979 1.5420434 0.9518876 
# Profile-likelihood 95% confidence intervals, transformed to the odds-ratio scale
exp(confint(lrm.1)[-1,]) # Removing constant from confidence intervals too
              2.5 %    97.5 %
AGE       0.9124232 1.0421301
LWT       0.9725636 0.9979524
RACEBlack 1.0232217 7.3225967
RACEOther 0.7567464 3.1462285
FTV       0.6771508 1.3109248

Download and Plot Fitbit Data - All in R

library(fitbitScraper)
library(scales)
library(ggplot2)

# Load the saved Fitbit login cookie created outside this script
source("c:/chuck/NYU/Stat Group/R Workshop/fitbit-cookie.r")

# Pull daily distance totals from 2014-01-01 through today
df <- get_daily_data(cookie,
                     what = "distance",
                     start_date = "2014-01-01",
                     end_date = as.character(Sys.Date()))

# Scatter plot of daily miles (zero-distance days dropped), colored by
# whether the day falls after 1/1/15, with a loess trend per color group
fig1 <- ggplot(subset(df, distance > 0),
               aes(x = as.Date(time),
                   y = distance,
                   colour = as.Date(time) > "2015-01-01")) +
  geom_point(shape = 1, size = 2) +
  stat_smooth(method = "loess", se = FALSE) +
  scale_y_continuous(breaks = seq(0, 16, 1)) +
  scale_x_date(breaks = date_breaks("2 months"),
               labels = date_format("%b-%Y")) +
  scale_colour_manual(name = "After 1/1/15",
                      values = c("#6B8E23", "#8A2BE2")) +
  labs(x = "", y = "Miles") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Extracting Data from Google Scholar

library(scholar)
library(purrr)

# Google Scholar profile IDs for four researchers
DF <- data.frame(Person = c("Chuck Cleland","Sam Friedman","Holly Hagan","Dani Ompad"),
                 GID = c("AuiR_rAAAAAJ","bQwCIGQAAAAJ","67LUQJUAAAAJ","Zxdn7DwAAAAJ"))

# Fetch each profile, then stack its citation metrics into one data frame
profiles <- map(DF$GID, get_profile)
do.call(rbind, map(profiles,
                   function(p){data.frame(Person = p$name,
                                          Total.Cites = p$total_cites,
                                          h.index = p$h_index,
                                          i10.index = p$i10_index)}))
              Person Total.Cites h.index i10.index
1 Charles M. Cleland        3228      29        77
2  Samuel R Friedman       21374      78       295
3        Holly Hagan       16430      49       138
4     Danielle Ompad        3357      33        68

Generating Non-Random Data

# An arithmetic sequence: from 1 to 10 in steps of 2
seq(1, 10, 2)
[1] 1 3 5 7 9
# The colon operator builds consecutive integer sequences
104:114
 [1] 104 105 106 107 108 109 110 111 112 113 114
# Repeat the whole vector three times
rep(c('A','B','C'), 3)
[1] "A" "B" "C" "A" "B" "C" "A" "B" "C"
# Repeat each element three times before moving to the next
rep(c('A','B','C'), each = 3)
[1] "A" "A" "A" "B" "B" "B" "C" "C" "C"
# Six evenly spaced values between 1 and 4
seq(1, 4, len = 6)
[1] 1.0 1.6 2.2 2.8 3.4 4.0
# All combinations of 5 people by 3 visits (a balanced long-format design)
expand.grid(Person = 1:5, Visit = 1:3)
   Person Visit
1       1     1
2       2     1
3       3     1
4       4     1
5       5     1
6       1     2
7       2     2
8       3     2
9       4     2
10      5     2
11      1     3
12      2     3
13      3     3
14      4     3
15      5     3

Generating Random Data

# Ten draws from a standard normal distribution (unseeded, so results vary per run)
rnorm(10)
 [1]  2.0665795 -2.7631017  0.1590837  0.3340059  0.9715169 -1.3661629
 [7] -0.4248921  0.2323954  0.4803709  1.4822548
# Ten uniform draws between 100 and 999
runif(10, min=100,max=999)
 [1] 772.3865 812.8910 291.6639 813.0696 635.7335 919.6467 722.3682
 [8] 871.1754 202.0830 628.5196
# Ten Bernoulli(0.5) trials (fair coin flips)
rbinom(10, 1, .5)
 [1] 0 1 1 0 1 0 1 0 1 1
# Sample 30 letters from a, b, c with replacement and unequal probabilities
sample(letters[1:3], size = 30, replace=TRUE, prob = c(.8,.1,.1))
 [1] "a" "a" "a" "a" "a" "a" "b" "c" "a" "a" "a" "b" "a" "a" "b" "a" "a"
[18] "a" "a" "a" "b" "b" "b" "a" "a" "a" "b" "a" "c" "a"
# A random permutation of the integers 1 through 10
sample(10)
 [1]  2  8  7  3  6  5  4  9  1 10

More on R Packages

Where and how to find R packages

How to install and load R packages

# Download and install epiR from CRAN (needed once per machine)
install.packages("epiR")
# Attach the package for this session
library(epiR)
# List the first 30 objects the package exports
head(ls("package:epiR"), n = 30)
 [1] "epi.2by2"         "epi.about"        "epi.asc"         
 [4] "epi.betabuster"   "epi.bohning"      "epi.ccc"         
 [7] "epi.ccsize"       "epi.cluster1size" "epi.cluster2size"
[10] "epi.clustersize"  "epi.cohortsize"   "epi.conf"        
[13] "epi.convgrid"     "epi.cp"           "epi.cpresids"    
[16] "epi.descriptives" "epi.detectsize"   "epi.dgamma"      
[19] "epi.directadj"    "epi.dms"          "epi.dsl"         
[22] "epi.edr"          "epi.empbayes"     "epi.equivb"      
[25] "epi.equivc"       "epi.herdtest"     "epi.indirectadj" 
[28] "epi.insthaz"      "epi.interaction"  "epi.iv"          

Resources to Learn More

Local R Interest Group?

That’s it for today, and thank you!